library(data.table)
library(readxl)
library(ggplot2)
library(gprofiler2)
library(glmnet)
library(ComplexHeatmap)
library(pROC)
library(caret)
library(mixOmics)
library(infotheo)
library(circlize)
library(grDevices)
set.seed(101)
# read CD meta data
meta <- as.data.table(readxl::read_excel("data/transcritome_patients_translated.xlsx"))
meta_key <- "CEL_FILE"
setkeyv(meta, meta_key) # set key
stopifnot(!any( duplicated(meta[,..meta_key]) )) # check for duplicate rows
stopifnot(!any( duplicated(colnames(meta)) )) # check for duplicate columns
# read CD expression data
expr <- as.data.table(readxl::read_excel("data/transcriptome_expression_matrix.xlsx"))
New names:
expr_key <- "gene"
colnames(expr)[1] <- expr_key
setkeyv(expr, expr_key) # set key
stopifnot(!any( duplicated(expr[,..expr_key]) )) # check for duplicate rows
stopifnot(!any( duplicated(colnames(expr)) )) # check for duplicate columns
expr <- expr[, c(key(expr), meta$CEL_FILE), with=F]
stopifnot(all(colnames(expr)[-1]==meta[,CEL_FILE]))
expr <- as.matrix(expr, rownames="gene")
# generate different meta sheets for different sample types
# Ctrl -> Control samples
# M0 -> M0I samples
# M6 -> M6
# MI -> M0M
meta_M0 <- meta[LOCATION=="M0"]
meta_MI <- meta[LOCATION=="MI"]
meta_M6 <- meta[LOCATION=="M6"]
meta_M0M6 <- meta[LOCATION=="M6"|LOCATION=="M0"]
meta_M0MI <- meta[LOCATION=="M0"|LOCATION=="MI"]
meta_M0MIC <- meta[LOCATION=="M0"|LOCATION=="MI"|LOCATION=="Ctrl"]
# read mouse metabolic signature
signature_dt <- fread("data/signature.csv")
signature_mouse <- signature_dt$gene
setkeyv(signature_dt, "gene")
# read all mouse genes measured with nanostring
measured_genes_mouse <- fread("data/All samples_NormalizedData.csv")[[1]]
# check that all signature genes are part of the measured genes
stopifnot(all(signature_mouse %in% measured_genes_mouse))
# map to human orthologs
ortholog_mapping <- as.data.table(gprofiler2::gorth(measured_genes_mouse, source_organism = "mmusculus", target_organism = "hsapiens", filter_na = F))
stopifnot(all(measured_genes_mouse %in% ortholog_mapping$input))
# mark signature genes and hits in the human expr data
ortholog_mapping$in_expr <- ortholog_mapping$ortholog_name %in% rownames(expr)
ortholog_mapping$in_signature <- ortholog_mapping$input %in% signature_mouse
ortholog_mapping[in_signature==T,mouse:=signature_dt[input,pattern]]
# count number of hits in the human expr data per mouse gene
n_in_expr <- ortholog_mapping[,.("n_hits" = sum(in_expr)),by=input]
# print number of hits in expr$gene per measured mouse gene
table(n_in_expr$n_hits)
0 1 2 3 6 7
48 708 8 2 1 1
# select only mouse genes with orthologs uniquely mapped to the human expression data
unique_ortholog_mapping <- ortholog_mapping[input %in% n_in_expr[n_hits==1,input] & in_expr]
# unmapped signature genes
print(paste0(nrow(unique_ortholog_mapping)," out of ",length(measured_genes_mouse), " measured mouse genes were mapped to the human expression genes"))
[1] "708 out of 768 measured mouse genes were mapped to the human expression genes"
ortholog_mapping[!input %in% unique_ortholog_mapping$input & in_signature]
plot_data <- data.table(sample=colnames(expr))
plot_data[,colSum:=colSums(expr)]
plot_data[,RECURRENCE:=meta$RECURRENCE]
plot_data[,LOCATION:=meta$LOCATION]
plot_data[,GENDER:=meta$GENDER]
ggplot(plot_data,aes(x=colSum,fill=RECURRENCE)) +
geom_density(alpha=0.5)
ggplot(plot_data,aes(x=colSum,fill=LOCATION)) +
geom_density(alpha=0.5)
ggplot(plot_data,aes(x=colSum,fill=GENDER)) +
geom_density(alpha=0.5)
high_var_genes <- sort(apply(expr[,meta$CEL_FILE], 1, sd), decreasing = T)[1:10000]
high_var_expr <- t(expr[names(high_var_genes),meta$CEL_FILE])
high_var_pca <- pca(high_var_expr, ncomp = 3, scale = T)
plotIndiv(high_var_pca, group = meta$LOCATION, ind.names = FALSE,
legend = TRUE, title="PCA - high variance genes", ellipse = T)
signature_pca <- pca(t(expr[unique_ortholog_mapping[in_expr==T & in_signature, ortholog_name], meta$CEL_FILE]), ncomp = 3, scale = TRUE)
plotIndiv(signature_pca, group = meta$LOCATION, ind.names = FALSE,
legend = TRUE, title="PCA - signature genes", ellipse = T)
table(meta$RECURRENCE, meta$LOCATION)
Ctrl M0 M6 MI
Ctrl 25 0 0 0
NR 0 57 36 43
R 0 139 85 104
table(meta$LOCATION)
Ctrl M0 M6 MI
25 196 121 147
Ctrl (25) -> Ctrl (25) M0 (196) -> M0I (200) MI (147) -> M0M (149) M6 (121) -> M6 (122) Why do the numbers from publication and meta sheet not match?
What is CENTRE? What are stenose, fistule, inflammatoire, Stoma? What is Postoperative anti-TNF?
Why do RutgeertRec and RECURRENCE not match 1 to 1?
table(meta$RutgeertRec, meta$RECURRENCE)
Ctrl NR R
Ctrl 25 0 0
Rec 0 0 326
Rem 0 136 2
custom_heatmap <- function(expr, meta, genes, column_names = F, scale = T, ...){
if(is.null(meta)){
samples <- colnames(expr)
column_ha = NULL
}else{
meta <- as.data.frame(meta, row.names = meta[[1]])[,-1,drop=F]
samples <- rownames(meta)
column_ha = HeatmapAnnotation(df=meta)
}
genes <- as.data.frame(genes, row.names = genes[[1]])[,-1,drop=F]
gene_names <- rownames(genes)
row_ha = rowAnnotation(
df=genes,
annotation_name_side="top",
#annotation_label=("\u0394/\u0394IEC"),
col = list(mouse= c("up" = "#E8E700", "down" = "#0092F4"))
)
if(scale){
expr <- t(apply(expr[gene_names,samples],1,scale))
colnames(expr) <- samples
}else{
expr <- expr[gene_names,samples]
}
Heatmap(expr,
show_column_names = column_names,
top_annotation = column_ha,
name = "Expression",
row_names_gp = gpar(fontsize = 10),
left_annotation = row_ha,
column_title_side = "top",
...
)
}
custom_heatmap(
expr,
meta[LOCATION=="M0",.(CEL_FILE,RECURRENCE,LOCATION,inflammatoire,Smoker,Granuloma,Rutgeert2)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = T)
custom_heatmap(
expr,
meta[LOCATION=="M6",.(CEL_FILE,RECURRENCE,LOCATION,Reecal_Rut=as.numeric(Reeval_Rut))],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = T)
custom_heatmap(
expr,
meta[LOCATION=="M6",.(CEL_FILE,RECURRENCE,LOCATION)][order(RECURRENCE)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = F)
custom_heatmap(
expr,
meta[LOCATION=="M0"|LOCATION=="MI",.(CEL_FILE,LOCATION)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = T)
custom_heatmap(
expr,
meta[LOCATION=="M0"|LOCATION=="MI",.(CEL_FILE,LOCATION)][order(LOCATION)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = F)
custom_heatmap(
expr,
meta[LOCATION=="M0"|LOCATION=="MI",.(CEL_FILE,LOCATION)][order(LOCATION)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)][order(mouse)],
cluster_columns = F, cluster_rows = F)
NA
NA
(One column per sample group)
# MI, M0, Ctrl
expr_scaled <- apply(expr[unique_ortholog_mapping[in_expr==T&in_signature==T,ortholog_name],meta_M0MIC$CEL_FILE] ,1,scale)
rownames(expr_scaled) <- meta_M0MIC$CEL_FILE
summarized_expr <- sapply(unique(meta_M0MIC$LOCATION), function(location){
colMeans(expr_scaled[meta_M0MIC[LOCATION==location,CEL_FILE],])
})
# mutual information
concordance <- data.frame(
mouse = unique_ortholog_mapping[in_signature==T,mouse],
human = ifelse(sign(summarized_expr[unique_ortholog_mapping[in_signature==T,ortholog_name],"M0"])>0, "up", "down")
)
concordance$concordance <- concordance$mouse == concordance$human
mi <- mutinformation(concordance$mouse,concordance$human)
random_mi <- sapply(1:1000, function(i){mutinformation(sample(concordance$mouse),concordance$human)})
mi_pval <- (sum(random_mi>mi) + 1)/(length(random_mi)+1)
paste("concordant genes:",sum(concordance$concordance),"mutual information:",mi,"p-value",mi_pval)
[1] "concordant genes: 23 mutual information: 0.0923613499679372 p-value 0.010989010989011"
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)] ,column_names=T, scale = F)
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)][order(mouse)],cluster_rows = F, column_names=T, scale = F)
# MI, M0
expr_scaled <- apply(expr[unique_ortholog_mapping[in_expr==T&in_signature==T,ortholog_name],meta_M0MI$CEL_FILE] ,1,scale)
rownames(expr_scaled) <- meta_M0MI$CEL_FILE
summarized_expr <- sapply(unique(meta_M0MI$LOCATION), function(location){
colMeans(expr_scaled[meta_M0MI[LOCATION==location,CEL_FILE],])
})
# mutual information
concordance <- data.frame(
mouse = unique_ortholog_mapping[in_signature==T,mouse],
human = ifelse(sign(summarized_expr[unique_ortholog_mapping[in_signature==T,ortholog_name],"M0"])>0, "up", "down")
)
concordance$concordance <- concordance$mouse == concordance$human
mi <- mutinformation(concordance$mouse,concordance$human)
random_mi <- sapply(1:1000, function(i){mutinformation(sample(concordance$mouse),concordance$human)})
mi_pval <- (sum(random_mi>mi) + 1)/(length(random_mi)+1)
paste("concordant genes:",sum(concordance$concordance),"mutual information:",mi,"p-value",mi_pval)
[1] "concordant genes: 24 mutual information: 0.129536460802155 p-value 0.00799200799200799"
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)] ,column_names=T, scale = F)
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)][order(mouse)],cluster_rows = F, column_names=T, scale = F)
# scale expression data
expr_scaled <- apply(expr[unique_ortholog_mapping[in_signature==T,ortholog_name],meta_M0MI$CEL_FILE] ,1,scale)
rownames(expr_scaled) <- meta_M0MI$CEL_FILE
# mean per location
summarized_expr <- sapply(unique(c("MI","M0")), function(location){
colMeans(expr_scaled[meta_M0MI[LOCATION==location,CEL_FILE],])
})
# concordance data
concordance <- data.frame(
mouse = unique_ortholog_mapping[in_signature==T,mouse],
human = ifelse(sign(summarized_expr[unique_ortholog_mapping[in_signature==T,ortholog_name],"M0"])>0, "up", "down")
)
concordance$concordance <- concordance$mouse == concordance$human
mi <- mutinformation(concordance$mouse,concordance$human)
set.seed(0)
random_mi <- sapply(1:10000, function(i){mutinformation(sample(concordance$mouse),concordance$human)})
mi_pval <- (sum(random_mi>mi) + 1)/(length(random_mi)+1)
paste("concordant genes:",sum(concordance$concordance),"mutual information:",mi,"p-value",mi_pval)
[1] "concordant genes: 24 mutual information: 0.129536460802155 p-value 0.004999500049995"
concordance$Mice <- ifelse(concordance$mouse=="up", "up in \u0394/\u0394IEC", "down in \u0394/\u0394IEC")
#grDevices::cairo_pdf("heatmap.pdf", width = 4, height = 7,)
# row annotation
row_ha = rowAnnotation(
Mice=concordance$Mice,
annotation_name_side="top",
annotation_name_rot=0,
col = list(Mice= c("up in \u0394/\u0394IEC" = "#E8E700", "down in \u0394/\u0394IEC" = "#0092F4"))
)
col_fun = colorRamp2(c(-0.5, 0, 0.5), c("#0092F4", "black", "#E8E700"))
concordance$col <- "grey"
concordance[concordance$concordance,]$col <- "black"
Heatmap(summarized_expr,
name = "Expression",
row_names_gp = gpar(fontsize = 9, col = concordance$col),
left_annotation = row_ha,
column_title_side = "top",
cluster_columns = F,
column_labels = c("M0M","M0I"),
column_names_side = "top",
column_names_rot = 0,
column_names_centered = T,
col = col_fun,
width = unit(3, "cm"),
)
#dev.off()
high_var_genes <- sort(apply(expr[,meta_M0$CEL_FILE], 1, sd), decreasing = T)[1:5000]
pls_da_expr <- t(expr[names(high_var_genes),meta_M0$CEL_FILE])
#pls_da_expr <- 2 ^ pls_da_expr
pca.expr <- pca(pls_da_expr, ncomp = 3, scale = TRUE)
plotIndiv(pca.expr, group = meta_M0$RECURRENCE, ind.names = FALSE,
legend = TRUE,
title = 'PCA comp 1 - 2')
plsda.expr <- plsda(pls_da_expr, meta_M0$RECURRENCE, ncomp = 10)
perf.plsda.expr <- perf(plsda.expr, validation = 'Mfold', folds = 3,
progressBar = FALSE,
nrepeat = 10)
plot(perf.plsda.expr, sd = TRUE, legend.position = 'horizontal')
logistic_glmnet_loc <- function(meta, expr, signature){
x <- t(expr[signature, meta$CEL_FILE])
y <- meta$LOCATION
fit <- glmnet(x, y, family = "binomial")
plot(fit, label = T)
cvfit <- cv.glmnet(x, y, family = "binomial")
plot(cvfit)
print(cvfit)
print(coef(cvfit, s = "lambda.1se"))
}
logistic_glm_loc <- function(meta, expr, signature, roc=F, summary=F, performance=F){
x <- t(expr[signature, meta$CEL_FILE])
y <- meta$LOCATION
data <- as.data.frame(x)
data$LOCATION <- 0
data$LOCATION[y=="M0"] <- 1
glm_model <- glm(LOCATION ~.,family = "binomial", data)
# prediction
model_prob = predict(glm_model, type = "response")
model_pred = ifelse(model_prob > 0.5, "M0", "MI")
train_tab = table(predicted = model_pred, actual = y)
train_con_mat = confusionMatrix(train_tab)
if(summary){
print(summary(glm_model))
}
if(roc){
roc(y ~ model_prob, plot = TRUE, print.auc = TRUE)
}
if(performance){
print(train_con_mat)
}
train_con_mat$overall["Accuracy"]
}
signature_human <- unique_ortholog_mapping[in_signature==T,ortholog_name]
logistic_glmnet_loc(meta_M0MI, expr, signature_human)
Call: cv.glmnet(x = x, y = y, family = "binomial")
Measure: Binomial Deviance
Lambda Index Measure SE Nonzero
min 0.00653 39 1.085 0.03910 22
1se 0.05055 17 1.121 0.03892 7
33 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 2.55205119
ALDOB .
APOA1 0.14498197
APOB .
APOC3 .
ARG1 .
ASNS .
CACNA1A .
CD274 .
CPS1 .
CREB3L3 0.02933740
CXCL9 .
DMGDH .
DUOX2 .
FAHD1 .
ICOS .
IDO1 .
INMT -0.10196269
KYAT3 .
NOS2 .
NOX1 .
OTC 0.07169543
PCK1 .
PDK4 .
PHGDH .
PSAT1 .
PTK6 .
RIMKLA .
SLC7A11 -0.21170216
SPIB .
STAT1 -0.27217305
TLR4 -0.05028450
TNF .
Logistic regression with full signature
# Analysis with glm
accuracy <- logistic_glm_loc(meta_M0MI, expr, signature_human, T, T, T)
Call:
glm(formula = LOCATION ~ ., family = "binomial", data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6509 -0.6866 0.1288 0.7103 2.5686
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 21.87211 14.56681 1.502 0.13323
ALDOB -2.40915 0.78284 -3.077 0.00209 **
APOA1 -0.39068 0.27320 -1.430 0.15271
APOB 0.20633 0.35331 0.584 0.55921
APOC3 0.36578 0.26479 1.381 0.16715
ARG1 -0.66917 0.86300 -0.775 0.43811
ASNS -0.04427 0.47465 -0.093 0.92568
CACNA1A 0.94422 0.51347 1.839 0.06593 .
CD274 -0.26257 0.32521 -0.807 0.41945
CPS1 1.70922 0.42632 4.009 6.09e-05 ***
CREB3L3 -0.21153 0.27957 -0.757 0.44927
CXCL9 -0.28598 0.22484 -1.272 0.20339
DMGDH -0.49668 0.90277 -0.550 0.58220
DUOX2 -0.07147 0.12958 -0.552 0.58128
FAHD1 -0.03254 0.53800 -0.060 0.95177
ICOS -0.15047 0.40333 -0.373 0.70910
IDO1 0.29994 0.31233 0.960 0.33688
INMT 0.25605 0.33474 0.765 0.44432
KYAT3 -0.60916 0.73320 -0.831 0.40607
NOS2 0.20450 0.28283 0.723 0.46965
NOX1 -0.88635 0.44853 -1.976 0.04814 *
OTC -1.03341 0.42154 -2.452 0.01423 *
PCK1 0.33806 0.18987 1.780 0.07500 .
PDK4 -0.12923 0.18824 -0.687 0.49238
PHGDH -0.59088 0.62564 -0.944 0.34495
PSAT1 -0.43509 0.36122 -1.205 0.22839
PTK6 0.41397 0.49678 0.833 0.40467
RIMKLA -0.18986 0.58764 -0.323 0.74663
SLC7A11 0.60062 0.27040 2.221 0.02634 *
SPIB -0.18824 0.26705 -0.705 0.48086
STAT1 0.68441 0.50918 1.344 0.17890
TLR4 0.68613 0.36408 1.885 0.05949 .
TNF -0.24646 0.42423 -0.581 0.56128
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 468.47 on 342 degrees of freedom
Residual deviance: 297.93 on 310 degrees of freedom
AIC: 363.93
Number of Fisher Scoring iterations: 7
Setting levels: control = M0, case = MI
Setting direction: controls > cases
Confusion Matrix and Statistics
actual
predicted M0 MI
M0 164 35
MI 32 112
Accuracy : 0.8047
95% CI : (0.7587, 0.8453)
No Information Rate : 0.5714
P-Value [Acc > NIR] : <2e-16
Kappa : 0.6002
Mcnemar's Test P-Value : 0.807
Sensitivity : 0.8367
Specificity : 0.7619
Pos Pred Value : 0.8241
Neg Pred Value : 0.7778
Prevalence : 0.5714
Detection Rate : 0.4781
Detection Prevalence : 0.5802
Balanced Accuracy : 0.7993
'Positive' Class : M0
Logistic regression with random signatures
random_signatures <- lapply(1:1000,function(i){
sample(unique_ortholog_mapping$ortholog_name,length(signature_human))
})
random_accuracy <- sapply(random_signatures, function(random_signature){
logistic_glm_loc(meta_M0MI, expr, random_signature)
})
ggplot(data.frame(random_accuracy=random_accuracy), aes(x=random_accuracy)) +
geom_histogram(bins=30) +
geom_vline(xintercept=accuracy, color="red")
(sum(random_accuracy>accuracy)+1)/(length(random_signatures)+1)
[1] 0.2217782
logistic_glmnet <- function(meta, expr, signature){
x <- t(expr[signature, meta$CEL_FILE])
y <- meta$RECURRENCE
fit <- glmnet(x, y, family = "binomial")
plot(fit, label = T)
cvfit <- cv.glmnet(x, y, family = "binomial", type.measure = "class")
plot(cvfit)
print(cvfit)
}
logistic_glm <- function(meta, expr, signature, roc=F, summary=F, performance=F){
x <- t(expr[signature, meta$CEL_FILE])
y <- meta$RECURRENCE
data <- as.data.frame(x)
data$RECURRENCE <- 0
data$RECURRENCE[y=="R"] <- 1
glm_model <- glm(RECURRENCE ~.,family = "binomial", data)
# prediction
model_prob = predict(glm_model, type = "response")
model_pred = ifelse(model_prob > 0.5, "R", "NR")
train_tab = table(predicted = model_pred, actual = y)
train_con_mat = confusionMatrix(train_tab)
if(summary){
print(summary(glm_model))
}
if(roc){
roc(y ~ model_prob, plot = TRUE, print.auc = TRUE)
}
if(performance){
print(train_con_mat)
}
train_con_mat$overall["Accuracy"]
}
signature_human <- unique_ortholog_mapping[in_signature==T,ortholog_name]
Feature selection with logistic glmnet
logistic_glmnet(meta_M0, expr, signature_human)
Call: cv.glmnet(x = x, y = y, type.measure = "class", family = "binomial")
Measure: Misclassification Error
Lambda Index Measure SE Nonzero
min 0.06086 1 0.2908 0.0275 0
1se 0.06086 1 0.2908 0.0275 0
Logistic regression with full signature
# Analysis with glm
accuracy <- logistic_glm(meta_M0, expr, signature_human, T, T, T)
Call:
glm(formula = RECURRENCE ~ ., family = "binomial", data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6967 -0.8290 0.5509 0.8016 1.8942
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -15.06772 15.79400 -0.954 0.3401
ALDOB -0.92023 0.41840 -2.199 0.0278 *
APOA1 0.10998 0.27791 0.396 0.6923
APOB 0.52203 0.30590 1.707 0.0879 .
APOC3 -0.27377 0.29954 -0.914 0.3607
ARG1 0.14516 1.07260 0.135 0.8923
ASNS -0.42582 0.53167 -0.801 0.4232
CACNA1A 0.16560 0.62152 0.266 0.7899
CD274 0.07812 0.37852 0.206 0.8365
CPS1 0.21568 0.37820 0.570 0.5685
CREB3L3 0.05876 0.34063 0.173 0.8630
CXCL9 0.25207 0.28752 0.877 0.3806
DMGDH -1.29012 0.89202 -1.446 0.1481
DUOX2 -0.05173 0.18021 -0.287 0.7741
FAHD1 0.41210 0.68509 0.602 0.5475
ICOS -0.50734 0.48449 -1.047 0.2950
IDO1 -0.24146 0.40907 -0.590 0.5550
INMT 0.74965 0.38851 1.930 0.0537 .
KYAT3 1.29103 0.80823 1.597 0.1102
NOS2 -0.06091 0.35451 -0.172 0.8636
NOX1 0.52374 0.55059 0.951 0.3415
OTC -0.03520 0.42550 -0.083 0.9341
PCK1 0.16418 0.17516 0.937 0.3486
PDK4 -0.13744 0.24200 -0.568 0.5701
PHGDH -0.16188 0.64987 -0.249 0.8033
PSAT1 -0.19922 0.44249 -0.450 0.6526
PTK6 -0.55283 0.65167 -0.848 0.3963
RIMKLA 0.94240 0.71040 1.327 0.1847
SLC7A11 0.42755 0.28598 1.495 0.1349
SPIB 0.20429 0.31458 0.649 0.5161
STAT1 0.80068 0.72341 1.107 0.2684
TLR4 -0.34050 0.39892 -0.854 0.3934
TNF -0.34636 0.39681 -0.873 0.3827
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 236.33 on 195 degrees of freedom
Residual deviance: 200.82 on 163 degrees of freedom
AIC: 266.82
Number of Fisher Scoring iterations: 5
Setting levels: control = NR, case = R
Setting direction: controls < cases
Confusion Matrix and Statistics
actual
predicted NR R
NR 19 9
R 38 130
Accuracy : 0.7602
95% CI : (0.6942, 0.8182)
No Information Rate : 0.7092
P-Value [Acc > NIR] : 0.06559
Kappa : 0.316
Mcnemar's Test P-Value : 4.423e-05
Sensitivity : 0.33333
Specificity : 0.93525
Pos Pred Value : 0.67857
Neg Pred Value : 0.77381
Prevalence : 0.29082
Detection Rate : 0.09694
Detection Prevalence : 0.14286
Balanced Accuracy : 0.63429
'Positive' Class : NR
Logistic regression with random signatures
random_signatures <- lapply(1:1000,function(i){
sample(unique_ortholog_mapping$ortholog_name,length(signature_human))
})
random_accuracy <- sapply(random_signatures, function(random_signature){
logistic_glm(meta_M0, expr, random_signature)
})
ggplot(data.frame(random_accuracy=random_accuracy), aes(x=random_accuracy)) +
geom_histogram(bins=30) +
geom_vline(xintercept=accuracy, color="red")
Feature selection with logistic glmnet
logistic_glmnet(meta_M6, expr, signature_human)
Call: cv.glmnet(x = x, y = y, type.measure = "class", family = "binomial")
Measure: Misclassification Error
Lambda Index Measure SE Nonzero
min 0.07089 8 0.2893 0.03628 5
1se 0.13596 1 0.2975 0.03363 0
Logistic regression with full signature
# Analysis with glm
accuracy <- logistic_glm(meta_M6, expr, signature_human, T, T, T)
Call:
glm(formula = RECURRENCE ~ ., family = "binomial", data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3392 -0.5410 0.2409 0.6472 1.6068
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -11.93445 27.08750 -0.441 0.65951
ALDOB 0.95027 1.21594 0.782 0.43450
APOA1 -1.62620 1.12759 -1.442 0.14925
APOB 0.95201 1.37919 0.690 0.49003
APOC3 0.27513 0.96583 0.285 0.77575
ARG1 -0.62188 2.15069 -0.289 0.77246
ASNS -1.23129 0.95416 -1.290 0.19689
CACNA1A -2.16046 1.20953 -1.786 0.07407 .
CD274 0.99056 0.86740 1.142 0.25346
CPS1 -0.13152 1.07094 -0.123 0.90226
CREB3L3 0.47578 0.61427 0.775 0.43861
CXCL9 0.18221 0.49859 0.365 0.71477
DMGDH -1.96184 2.60050 -0.754 0.45060
DUOX2 0.28120 0.28756 0.978 0.32814
FAHD1 3.23421 1.45507 2.223 0.02623 *
ICOS 0.47828 0.98646 0.485 0.62779
IDO1 -0.03772 0.79989 -0.047 0.96239
INMT -0.56813 1.15972 -0.490 0.62421
KYAT3 0.55285 1.49817 0.369 0.71212
NOS2 -1.08783 0.58724 -1.852 0.06396 .
NOX1 0.50637 0.96109 0.527 0.59828
OTC -1.05608 1.05991 -0.996 0.31906
PCK1 0.42407 0.54764 0.774 0.43872
PDK4 -1.22094 0.63454 -1.924 0.05434 .
PHGDH 5.37410 1.49335 3.599 0.00032 ***
PSAT1 -0.10896 0.70550 -0.154 0.87726
PTK6 -2.29695 1.14187 -2.012 0.04426 *
RIMKLA -0.63140 1.50560 -0.419 0.67495
SLC7A11 -0.21761 0.75281 -0.289 0.77253
SPIB -0.68279 0.81850 -0.834 0.40417
STAT1 -0.35248 0.99028 -0.356 0.72189
TLR4 -0.15202 0.82122 -0.185 0.85314
TNF 2.17205 1.21091 1.794 0.07286 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 147.317 on 120 degrees of freedom
Residual deviance: 94.893 on 88 degrees of freedom
AIC: 160.89
Number of Fisher Scoring iterations: 6
Setting levels: control = NR, case = R
Setting direction: controls < cases
Confusion Matrix and Statistics
actual
predicted NR R
NR 23 10
R 13 75
Accuracy : 0.8099
95% CI : (0.7286, 0.8755)
No Information Rate : 0.7025
P-Value [Acc > NIR] : 0.005027
Kappa : 0.5341
Mcnemar's Test P-Value : 0.676657
Sensitivity : 0.6389
Specificity : 0.8824
Pos Pred Value : 0.6970
Neg Pred Value : 0.8523
Prevalence : 0.2975
Detection Rate : 0.1901
Detection Prevalence : 0.2727
Balanced Accuracy : 0.7606
'Positive' Class : NR
Logistic regression with random signatures
random_signatures <- lapply(1:1000,function(i){
sample(unique_ortholog_mapping$ortholog_name,length(signature_human))
})
random_accuracy <- sapply(random_signatures, function(random_signature){
logistic_glm(meta_M6, expr, random_signature)
})
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurredWarning: glm.fit: fitted probabilities numerically 0 or 1 occurredWarning: glm.fit: fitted probabilities numerically 0 or 1 occurredWarning: glm.fit: fitted probabilities numerically 0 or 1 occurredWarning: glm.fit: fitted probabilities numerically 0 or 1 occurred
ggplot(data.frame(random_accuracy=random_accuracy), aes(x=random_accuracy)) +
geom_histogram(bins=30) +
geom_vline(xintercept=accuracy, color="red")
Feature selection with logistic glmnet
logistic_glmnet(meta_MI, expr, signature_human)
Call: cv.glmnet(x = x, y = y, type.measure = "class", family = "binomial")
Measure: Misclassification Error
Lambda Index Measure SE Nonzero
min 0.05889 1 0.2925 0.03973 0
1se 0.05889 1 0.2925 0.03973 0
Logistic regression with full signature
# Analysis with glm
accuracy <- logistic_glm(meta_MI, expr, signature_human, T, T, T)
Call:
glm(formula = RECURRENCE ~ ., family = "binomial", data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3934 -0.7820 0.4690 0.7313 2.2220
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.17530 24.79441 -0.168 0.8663
ALDOB 2.26021 1.58793 1.423 0.1546
APOA1 1.00416 0.53227 1.887 0.0592 .
APOB -2.01022 1.00052 -2.009 0.0445 *
APOC3 -0.30466 0.52124 -0.584 0.5589
ARG1 -1.85550 1.28265 -1.447 0.1480
ASNS 0.22660 0.75871 0.299 0.7652
CACNA1A 0.01282 0.77252 0.017 0.9868
CD274 0.56382 0.57507 0.980 0.3269
CPS1 0.53332 0.65796 0.811 0.4176
CREB3L3 -0.54022 0.47594 -1.135 0.2564
CXCL9 -0.30102 0.37615 -0.800 0.4236
DMGDH 0.41788 1.88106 0.222 0.8242
DUOX2 0.10864 0.19018 0.571 0.5678
FAHD1 0.36869 0.80406 0.459 0.6466
ICOS -1.39655 0.63234 -2.209 0.0272 *
IDO1 0.78642 0.52369 1.502 0.1332
INMT 0.43336 0.54341 0.797 0.4252
KYAT3 0.99032 1.16555 0.850 0.3955
NOS2 0.52479 0.46142 1.137 0.2554
NOX1 0.27130 0.85806 0.316 0.7519
OTC -0.83141 0.74064 -1.123 0.2616
PCK1 -0.21972 0.48399 -0.454 0.6498
PDK4 -0.11671 0.32671 -0.357 0.7209
PHGDH -0.37393 1.09492 -0.342 0.7327
PSAT1 -0.36391 0.58402 -0.623 0.5332
PTK6 0.16596 0.76056 0.218 0.8273
RIMKLA -0.51355 1.12269 -0.457 0.6474
SLC7A11 -0.46536 0.44119 -1.055 0.2915
SPIB -0.37485 0.48838 -0.768 0.4428
STAT1 -0.87887 0.81733 -1.075 0.2822
TLR4 0.45989 0.64286 0.715 0.4744
TNF 0.72373 0.98126 0.738 0.4608
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 177.69 on 146 degrees of freedom
Residual deviance: 142.78 on 114 degrees of freedom
AIC: 208.78
Number of Fisher Scoring iterations: 5
Setting levels: control = NR, case = R
Setting direction: controls < cases
Confusion Matrix and Statistics
actual
predicted NR R
NR 17 9
R 26 95
Accuracy : 0.7619
95% CI : (0.6847, 0.8282)
No Information Rate : 0.7075
P-Value [Acc > NIR] : 0.085037
Kappa : 0.3493
Mcnemar's Test P-Value : 0.006841
Sensitivity : 0.3953
Specificity : 0.9135
Pos Pred Value : 0.6538
Neg Pred Value : 0.7851
Prevalence : 0.2925
Detection Rate : 0.1156
Detection Prevalence : 0.1769
Balanced Accuracy : 0.6544
'Positive' Class : NR
Logistic regression with random signatures
random_signatures <- lapply(1:1000,function(i){
sample(unique_ortholog_mapping$ortholog_name,length(signature_human))
})
random_accuracy <- sapply(random_signatures, function(random_signature){
logistic_glm(meta_MI, expr, random_signature)
})
ggplot(data.frame(random_accuracy=random_accuracy), aes(x=random_accuracy)) +
geom_histogram(bins=30) +
geom_vline(xintercept=accuracy, color="red")